library(here)
library(ggplot2)
library(tidyverse)
library(stringr)
library(DESeq2)
library(ggsci)
library(variancePartition)
library(patchwork)
## Define main path
main_path <- file.path("/home/grocamora/RytenLab-Research/18-DGE_limma_dream")# here::here()
## Set options
options(dplyr.summarise.inform = FALSE)
options(lifecycle_verbosity = "warning")
knitr::opts_chunk$set(echo = T, warning = F, message = F, out.width="100%",
fig.align = "center", dpi = 300,
fig.height = 7.2, fig.width = 7.2)Ryten server:
/home/grocamora/RytenLab-Research/18-DGE_limma_dream
This markdown will illustrate the output of the covariates found from the script:
/home/grocamora/RytenLab-Research/18-DGE_limma_dream/Scripts/new_covariate_correction.R
It is required to run the previous script before this markdown. The script employed Aine’s code from /home/MinaRyten/Aine/hardy_snrnaseq/scripts/02-identifying_covariates.R
The current approach:
This approach looked for a covariate with:
variancePartition \(Q3 >= 3%\)variancePartition maximum >= 75%results_path <- file.path(main_path, "results/Hardy_NewCovs")
rds.files = readRDS(file.path(results_path, "varPartCorPlots.rds"))Notes about covariate assessment from Aine’s script:
a: Heatmap to show the correlation between each numerical covariate and each principle component (PC).
b: Heatmap to show the correlation between each categorical covariate and each principal component (PC).
c: Scree plot to show the variance explained by each PC.
d: Distribution plot visualising the output of variancePartition. This shows, for each covariate, the distribution of variance explained (%) for each gene.
# slots in the 02-varPartCorPlots*.rds.files:
# 1. variancePartition distribution plot
# 2. PCA-covariate correlation heatmap
# 3. Prop variance explained by each PC
# 4. distribution of covariate colinearity
# 5. PC-meta correlations with variance explained and variance_weighted_correlation
# function to generate a 3-part figure from each rds object
generate.fig = function(l){
# l=readRDS("/home/MRAineFairbrotherBrowne/MinaRyten/Aine/hardy_snrnaseq/envs/02-varPart/02-varPartCorPlots_ACG_Astrocytes_annot1.rds")
# extract varPart plot
vp = l[[1]]
# fix elements of variancePartition plot
vp$layers[[2]]$geom_params$outlier.alpha = 0.05
vp$theme$axis.text.x$size = 8
vp$theme$axis.text.x$angle = 90
# extract heatmap
hm = l[[2]]
# extract variance explained plot
ve = l[[3]]
ve$layers[[1]]$label.size = 0.01
# # colinearity plots
# cl = l[[4]]$patches$plots[[2]] + l[[4]]$patches$plots[[1]]
fig = ((hm) / (ve | vp)) +
plot_layout(heights = c(1.5, 1.5, 1)) +
plot_annotation(tag_levels = 'a')
return(fig)
}
generate.fig(rds.files)Notes about cutoffs from Aine’s script:
variancePartition value: this gives an idea of whether this covariate comprises a very large proportion of the variation of any gene(s), in which case it important to control for thisvariancePartition distribution: this gives an idea of whether this covariate comprises a sizeable proportion of the variation of genes as a wholeThe following visualisations aim to reveal the distributions of these three metrics, enabling cut-off setting and aid in the selection covariates that are important to control for across the datasets.
dat = rds.files[[5]] %>%
dplyr::select(meta_var, name, p, padj, rho = stat, variance_weighted_correlation, var_type) %>%
dplyr::arrange(-variance_weighted_correlation) %>%
dplyr::group_by(meta_var) %>%
dplyr::filter(variance_weighted_correlation>0) %>%
dplyr::summarise(variance_weighted_correlation = max(variance_weighted_correlation),
min_p = min(p)) %>%
dplyr::ungroup() %>%
dplyr::left_join(
x=.,
y=rds.files[[5]] %>%
dplyr::select(meta_var, name, p, padj, rho = stat, variance_weighted_correlation, var_type),
by=c("meta_var", "variance_weighted_correlation")
) %>%
dplyr::rename(pc=name) %>%
dplyr::arrange(-variance_weighted_correlation) %>%
# then calculate and join variancePartition metrics
dplyr::left_join(
x=.,
y=rds.files[[1]]$data %>%
dplyr::group_by(variable) %>%
dplyr::summarise(varPart.q3 = as.numeric(quantile(value)[4]),
varPart.max = as.numeric(quantile(value)[5])),
by=c("meta_var"="variable")
) %>%
dplyr::mutate(dataset = "Hardy_Bulk") %>%
dplyr::mutate(is.sig = dplyr::case_when(
((p<=0.05) & (padj>0.05)) ~ "P<0.05",
p>0.05 ~ "NS",
padj<=0.05 ~ "FDR<0.05",
TRUE ~ ""))
p0 = dat %>%
dplyr::filter(var_type=="categorical") %>%
ggplot(data=., aes(y=reorder(meta_var, p), x=-log10(p), colour=is.sig)) +
geom_point(aes(size=as.numeric(pc)), alpha=0.7) +
geom_point() +
scale_colour_manual(values = c("FDR<0.05" = "purple", "P<0.05" = "red", "NS"="grey")) +
ylab("") +
xlab("-log10 P-value for K-W chi-squared test") +
scale_size_identity() +
geom_vline(xintercept=-log10(0.05), linetype=2, alpha=0.7) +
theme(legend.title = element_blank())
p1 = dat %>%
dplyr::filter(var_type=="continuous") %>%
ggplot(data=., aes(y=reorder(meta_var, p), x=variance_weighted_correlation, colour=is.sig)) +
geom_point(aes(size=as.numeric(pc)), alpha=0.7) +
scale_colour_manual(values = c("FDR<0.05" = "purple", "P<0.05" = "red", "NS"="grey")) +
ylab("") +
xlab("Variance explained \n -weighted Spearman's rho") +
scale_size_identity() +
geom_vline(xintercept=0.025, linetype=2, alpha=0.7) +
theme(legend.title = element_blank())
p2 = dat %>%
ggplot(data=., aes(y=reorder(meta_var, p), x=varPart.q3, colour=is.sig)) +
geom_point(size=2, alpha=0.7) +
scale_colour_manual(values = c("FDR<0.05" = "purple", "P<0.05" = "red", "NS"="grey")) +
geom_vline(xintercept=3, linetype=2, alpha=0.7) +
ylab("") +
xlab("Q3 % var explained \n (variancePartition)") +
guides(colour=FALSE)
p3 = dat %>%
ggplot(data=., aes(y=reorder(meta_var, p), x=varPart.max, colour=is.sig)) +
geom_point(size=2, alpha=0.7) +
scale_colour_manual(values = c("FDR<0.05" = "purple", "P<0.05" = "red", "NS"="grey")) +
geom_vline(xintercept=75, linetype=2, alpha=0.75) +
ylab("") +
xlab("Max % var explained \n (variancePartition)") +
guides(colour=FALSE)
p0/p1/p2/p3Notes about final covariate extracted from Aine’s script:
selected_covs = dat %>%
dplyr::group_by(meta_var) %>%
dplyr::summarise(n.PC.sig = sum((padj<=0.05 & var_type=="categorical") |
(padj<=0.05 & variance_weighted_correlation>=0.025 & var_type=="continuous")),
n.varPart.q3 = sum(varPart.q3 >= 3),
n.varPart.max = sum(varPart.max >= 75)) %>%
dplyr::ungroup() %>%
# A: passes (FDR<0.05 & variance-weighted covariate-PC cut-off AND passes
# variancePartition q3 cut-off) OR (max cut-off at any point)
dplyr::filter((n.PC.sig >= 1 & n.varPart.q3 >= 1) | (n.varPart.max >= 1)) %>%
dplyr::select(meta_var) %>%
dplyr::distinct()
selected_covs